package cn.chencq.spider.novel.service;

import java.net.Proxy;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.chencq.spider.novel.utils.SpiderConfigUtil;
import cn.chencq.spider.novel.utils.Utils;

public abstract class AbstractSpider {

	private static final Logger logger = LoggerFactory.getLogger(AbstractSpider.class);

	/**
	 * Fetches and parses the HTML document at the given novel-site URL.
	 * <p>
	 * Uses an optional proxy from {@link Utils#getProxy()}, sets the {@code Host}
	 * header from the URL and a fixed Chrome {@code User-Agent}, and retries up to
	 * {@code SpiderConfigUtil.getRetryTime()} times on any failure.
	 *
	 * @param url the absolute URL of the page to crawl
	 * @return the parsed {@link Document}, or {@code null} if every attempt failed
	 */
	protected Document crawl(String url) {
		Connection conn = Jsoup.connect(url);
		Proxy proxy = Utils.getProxy();
		if (proxy != null) {
			conn.proxy(proxy);
		}
		conn.header("Host", Utils.getHost(url));
		conn.header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36");

		// Hoist the retry limit out of the loop; it does not change between attempts.
		int maxRetries = SpiderConfigUtil.getRetryTime();
		for (int attempt = 1; attempt <= maxRetries; attempt++) {
			try {
				return conn.ignoreContentType(true).timeout(3000).get();
			} catch (Exception e) {
				// Pass the throwable as the last argument (not a {} placeholder)
				// so SLF4J logs the full stack trace.
				logger.error("crawl attempt {}/{} failed for url {}", attempt, maxRetries, url, e);
			}
		}
		// All retries exhausted — callers must handle a null document.
		return null;
	}
}
